Adel Abu Hashim - Oct 2020
This case study aims to help Amber Heard
by analyzing new accounts posting/commenting against a victim of a Social Bot Disinformation/Influence Operation.
We have four main datasets:
(The datasets were scraped from Reddit.)
- 1- A dataset with submissions & comments data (2019).
- 2- Users Data (from 2006 to 2019).
- 3- A merged dataset (submissions & comments data, users data).
- 4- Daily creation data (# of accounts created per day from 2006 to 2019)
#import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import helpers
import matplotlib.dates as mdates
import plotly.express as px
import plotly.graph_objects as go
import re
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings that
# may matter; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Apply seaborn's "darkgrid" style to subsequent plots.
sb.set_style("darkgrid")
%matplotlib inline
# Read the two cleaned 2019 Reddit exports: the submissions/comments data
# on its own, and the same data merged with the users data.
df = pd.read_csv("cleaned_data/reddit_cleaned_2019.csv")
df_merged = pd.read_csv("cleaned_data/reddit_merged_2019.csv")

# The timestamp columns come back from the CSV files unparsed; turn them
# into datetime columns so the `.dt` accessor can be used later on.
df["created_at"] = pd.to_datetime(df["created_at"])
df_merged["created_at"] = pd.to_datetime(df_merged["created_at"])
df_merged["user_created_at"] = pd.to_datetime(df_merged["user_created_at"])

# Show the (rows, columns) of the contributions frame; the trailing
# semicolon keeps the notebook from rendering the preview.
print(df.shape)
df.head(2);
(23678, 17)
# Show the (rows, columns) of the merged frame.
print(df_merged.shape)
# Trailing semicolon: evaluate the preview without rendering it in the notebook.
df_merged.head(2);
(23678, 24)
# Keep only the rows whose `is_banned` flag is set.
df_banned = df_merged.loc[df_merged['is_banned']]

# Size of the banned subset (preview suppressed by the semicolon).
print(df_banned.shape)
df_banned.head(2);
(1875, 24)
Note: we only have user names for the banned accounts
# Keep only the rows whose `has_verified_email` flag is not set.
df_unverified = df_merged.loc[~df_merged['has_verified_email']]

# Size of the unverified subset (preview suppressed by the semicolon).
print(df_unverified.shape)
df_unverified.head(2);
(2819, 24)
# Rows contributed by accounts whose creation year is 2018.
df_18 = df_merged.loc[df_merged['user_created_at'].dt.year.eq(2018)]

# Size of the 2018-accounts subset (preview suppressed by the semicolon).
print(df_18.shape)
df_18.head(2);
(3357, 24)
# Rows contributed by accounts whose creation year is 2019.
df_19 = df_merged.loc[df_merged['user_created_at'].dt.year.eq(2019)]

# Size of the 2019-accounts subset (preview suppressed by the semicolon).
print(df_19.shape)
df_19.head(2);
(2270, 24)
# The peak day examined in this analysis, as a YYYY-MM-DD string.
peak_day = '2019-03-15'

# Contributions posted on the peak day (all accounts), then the
# submissions among them.
df_peak = df_merged[df_merged['created_at'].dt.strftime('%Y-%m-%d') == peak_day]
df_peak_submissions = df_peak[df_peak['submission_comment'] == 'submission']

# Same two selections, restricted to unverified accounts.
df_unverified_peak = df_unverified[
    df_unverified['created_at'].dt.strftime('%Y-%m-%d') == peak_day
]
df_unverified_peak_submissions = df_unverified_peak[
    df_unverified_peak['submission_comment'] == 'submission'
]

# Submissions over the whole dataset, for all accounts and for unverified ones.
df_submissions = df_merged[df_merged['submission_comment'] == 'submission']
df_unverified_submissions = df_unverified[
    df_unverified['submission_comment'] == 'submission'
]
# Pie chart: share of 2019 contributions per `banned_unverified` category.
colors = px.colors.qualitative.T10

# Build a two-column frame with explicit column names.  The default names
# produced by `value_counts().to_frame().reset_index()` changed in
# pandas 2.0 (from 'index'/'banned_unverified' to 'banned_unverified'/'count'),
# so naming them here keeps the chart working on both old and new pandas.
# The hover labels consequently read 'category'/'contributions'.
contribution_counts = (
    df_merged['banned_unverified']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='contributions')
)

fig = px.pie(
    contribution_counts,
    values='contributions',
    names='category',
    color_discrete_sequence=colors,
    title='Contributions of banned / unverified /others in 2019',
)
# Put percent, label and raw count inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label+value')
fig.show()